## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [conflicted] Will prefer dplyr::filter over any other package.
## [conflicted] Will prefer dplyr::select over any other package.
## [conflicted] Will prefer dplyr::slice over any other package.
## [conflicted] Will prefer dplyr::rename over any other package.
## [conflicted] Will prefer dplyr::intersect over any other package.
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
mytheme = theme_bw(base_size = 10) +
theme(text = element_text(size=10, colour='black'),
title = element_text(size=10, colour='black'),
line = element_line(size=0.5),
axis.title = element_text(size=10, colour='black'),
axis.text = element_text(size=10, colour='black'),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(size=0.5),
strip.background = element_blank(),
strip.text.y = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = c(0.8, 0.8),
legend.text=element_text(size=10)) ## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 32 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (6): nr_detected, nr_expected, sensitivity, perc_compound_val, total_n, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2303689 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group,...
## dbl (9): start, end, count_UT, count_T, nr_reads_UT, nr_reads_T, cpm_T, cpm_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ$tool = factor(all_circ$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
cq$tool = factor(cq$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
val$tool = factor(val$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
sens$tool = factor(sens$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
cq## Rows: 192 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): chr, strand, BSJ_count_int, circ_id
## dbl (5): start, end, BSJ_count, Cq, Cq_mean_PCR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq_cum = cq_test %>%
select(Cq_mean_PCR, BSJ_count, circ_id) %>%
unique() %>%
group_by(BSJ_count) %>%
arrange(desc(BSJ_count)) %>%
summarize(total_n = n(), val_n = sum(Cq_mean_PCR < 32)) %>%
ungroup() %>%
arrange(desc(BSJ_count)) %>%
mutate(total_n_cum = cumsum(total_n),
val_n_cum = cumsum(val_n)) %>%
mutate(perc_val = val_n_cum/total_n_cum)
cq_cumcq_cum %>%
ggplot(aes(BSJ_count, perc_val)) +
geom_point(size = 1) +
geom_vline(xintercept = 5, color = "#999999", linetype = 'dashed') +
geom_smooth(color = '#5AB4E5') +
scale_x_continuous(trans='log10') +
scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
mytheme## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 24 rows containing missing values (`geom_smooth()`).
## Rows: 15594 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): chr, strand, cell_line, tool, circ_id, circ_id_strand, circ_ID
## dbl (11): BSJ_count, start, end, design, primer_found, total_primer_pairs, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# showing how many primers failed
primer_design$tool = factor(primer_design$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
primer_design %>%
mutate(fail_reason = ifelse(total_primer_pairs == failed_spec, 'failed specificity', NA),
fail_reason = ifelse(total_primer_pairs == failed_SNP, 'failed SNPs', fail_reason),
fail_reason = ifelse(total_primer_pairs == failed_sec_str_amp, 'failed sec str amp', fail_reason),
fail_reason = ifelse(total_primer_pairs == failed_sec_str_temp, 'failed sec str temp', fail_reason),
fail_reason = ifelse(is.na(fail_reason), 'failed sec str amp', fail_reason),
fail_reason = ifelse(design == 0, 'no_design', fail_reason),
fail_reason = ifelse(primer_found == 1, 'succesful_design', fail_reason)) %>%
mutate(count_group = ifelse(BSJ_count > 4, 'count ≥ 5', 'count < 5'),
count_group = ifelse(tool == 'Sailfish-cir', 'no_counts', count_group)) %>%
ggplot(aes(tool, fill = fail_reason)) +
scale_fill_manual(values = c("#E69F00", '#CC79A7', '#00B9F2', "#009E73")) +
geom_bar(position = 'fill') +
facet_grid(~count_group, scales = 'free', space = 'free') +
scale_y_continuous(labels = scales::percent_format()) +
mytheme_discrete_x +
xlab('') +
ylab('% of circRNAs') +
theme(legend.position = 'right')#ggsave('sup_figures/sup_figure_12.pdf', width = 20, height = 10, units = "cm")
# numbers
primer_design %>%
filter(primer_found == 0) %>%
mutate(fail_reason = ifelse(total_primer_pairs == failed_spec, 'failed specificity', NA),
fail_reason = ifelse(total_primer_pairs == failed_SNP, 'failed SNPs', fail_reason),
fail_reason = ifelse(total_primer_pairs == failed_sec_str_amp, 'failed sec str amp', fail_reason),
fail_reason = ifelse(total_primer_pairs == failed_sec_str_temp, 'failed sec str temp', fail_reason),
fail_reason = ifelse(is.na(fail_reason), 'failed sec str amp', fail_reason),
fail_reason = ifelse(design == 0, 'no_design', fail_reason),
fail_reason = ifelse(primer_found == 1, 'succesful_design', fail_reason)) %>%
group_by(tool) %>%
summarise(perc_ok = sum(fail_reason == 'failed specificity')/n()) %>%
pull(perc_ok) %>% quantile()## 0% 25% 50% 75% 100%
## 0.2929293 0.3508493 0.3820842 0.4216270 0.5781893
cq %>%
filter(!tool == 'Sailfish-cir') %>%
ggplot(aes(tool, BSJ_count)) +
geom_boxplot() +
#geom_point(size = 0.5) +
#geom_jitter() +
#facet_wrap(~count_group, scales = 'free') +
mytheme_discrete_x +
scale_y_log10() +
ylab('BSJ count') +
xlab('')see illustator
val_cum = cq %>%
select(tool, circ_id, cell_line, BSJ_count, qPCR_val, RR_val, amp_seq_val, compound_val) %>%
group_by(tool, BSJ_count) %>%
summarise(total_nr_qPCR = n(),
val_nr_qPCR = sum(qPCR_val == 'pass'),
total_nr_RR = n() - sum(is.na(RR_val)), # here NA are the ones that have are 'out_of_range'
val_nr_RR = sum(RR_val == 'pass', na.rm = T),
total_nr_amp = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included' => same for compound val
val_nr_amp = sum(amp_seq_val == 'pass', na.rm = T),
val_nr_compound = sum(compound_val == 'pass', na.rm = T)) %>%
ungroup() %>%
group_by(tool) %>%
arrange(desc(BSJ_count)) %>%
mutate(val_perc_cum_qPCR = cumsum(val_nr_qPCR)/cumsum(total_nr_qPCR),
val_perc_cum_RR = cumsum(val_nr_RR)/cumsum(total_nr_RR),
val_perc_cum_amp = cumsum(val_nr_amp)/cumsum(total_nr_amp),
val_perc_cum_compound = cumsum(val_nr_compound)/cumsum(total_nr_amp)) %>%
ungroup()## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
# pivot longer to get all validation methods in one column
val_cum_long = val_cum %>%
pivot_longer(cols = c(val_perc_cum_qPCR, val_perc_cum_RR,
val_perc_cum_amp, val_perc_cum_compound),
values_to = 'cum_precision', names_to = "type")
val_cum_longval_cum_long$type = factor(val_cum_long$type, levels = c('val_perc_cum_qPCR', 'val_perc_cum_RR', 'val_perc_cum_amp', 'val_perc_cum_compound'))
val_cum_long %>%
ggplot(aes(BSJ_count, cum_precision, color = type)) +
geom_point(size = 0.8) +
geom_line() +
facet_wrap(~tool, scales = 'free_x') +
scale_x_log10() +
scale_y_continuous(labels = scales::percent_format()) +
mytheme +
theme(legend.position = 'right') +
labs(color=NULL) +
scale_color_manual(values=c("#009E73", "#F0E442", "#0072B2", '#D55E00'),
labels = c('cumulative qPCR precision', 'cumulative RNase R precision',
'cumulative amp seq precision', 'cumulative compound precision')) +
xlab('BSJ count') +
ylab('cumulative precision')## Warning: Removed 10 rows containing missing values (`geom_point()`).
see illustrator
treatment$enrichment_bin = factor(treatment$enrichment_bin,
levels = c('count treated NA', 'not enriched', 'enriched'))
treatment$tool = factor(treatment$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
treatment %>%
filter(!count_group == 'count < 5') %>%
ggplot(aes(tool, fill = enrichment_bin)) +
geom_bar(position = 'fill') +
mytheme_discrete_x +
scale_y_continuous(labels = scales::percent_format()) +
theme(legend.position = 'right') +
scale_fill_manual(values = c('#CC79A7', '#D55E00', '#009E73'), name = '') +
xlab('') +
ylab('% of circRNAs')treatment %>%
filter(count_UT >= 5, !is.na(count_UT), !is.na(count_T)) %>%
filter(!tool == "Sailfish-cir") %>%
ggplot(aes(tool, enrichment)) +
geom_violin(draw_quantiles = c(0.5)) +
geom_hline(yintercept = 1, color = '#999999', linetype = "dashed") +
geom_text(
data = treatment %>%
filter(count_UT >= 5, !is.na(count_UT), !is.na(count_T)) %>%
filter(!tool == "Sailfish-cir") %>%
group_by(tool) %>%
summarise(enrich_median = median(enrichment),
enrich_median_place = enrich_median + 1),
mapping = aes(tool, enrich_median_place, label = sprintf("%0.2f", round(enrich_median, digits = 2)))) +
scale_y_log10(labels = scales::comma_format()) +
xlab('') +
ylab('CPM enrichment after RNase R treatment (treated CPM / untreated CPM)') +
mytheme_discrete_xtreatment %>%
filter(!count_group == 'count < 5',
!is.na(count_UT),
!is.na(count_T))%>%
#filter(!tool == 'Sailfish-cir') %>%
ggplot(aes(enrichment, color = tool)) +
stat_ecdf() +
scale_x_log10(labels = scales::comma_format(), limits = c(0.001, 1000)) +
scale_y_continuous(label = scales::percent_format()) +
geom_vline(xintercept = 1, color = '#999999', linetype = "dashed") +
ylab('cumulative % of circRNAs') +
xlab('enrichment factor') +
mytheme +
theme(legend.position = 'right')## Warning: Removed 292 rows containing non-finite values (`stat_ecdf()`).
map_perc_cum_tool = cq %>%
select(circ_id_strand, tool, perc_on_target, count_group) %>%
unique() %>%
filter(!is.na(perc_on_target)) %>%
group_by(tool, count_group) %>%
mutate(total_n = n()) %>%
arrange(desc(perc_on_target)) %>%
mutate(n_cum = 1:n(),
n_cum_perc = n_cum/total_n)
map_perc_cum_toolplot
map_perc_cum_tool$tool = factor(map_perc_cum_tool$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl", "Sailfish-cir"))
map_perc_cum_tool %>%
ggplot(aes(perc_on_target, n_cum_perc, color = count_group)) +
geom_line() +
xlab('minimum % on-target amplification') +
ylab('% of circRNAs') +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
mytheme +
facet_wrap(~tool) +
scale_color_manual(values=c( '#56B4E9', '#E69F00', '#999999')) +
theme(legend.position = 'right') +
geom_vline(xintercept = 0.5, linetype = 'dashed', color = '#999999')make one validation label for each circ (that combines the 3 val methods) this will be used later for ordering and make the dataframe longer
hm = cq %>%
select(tool, circ_id, count_group, qPCR_val, RR_val, amp_seq_val) %>%
mutate(all_val = paste(qPCR_val, RR_val, amp_seq_val, sep = '_')) %>%
pivot_longer(cols = c(qPCR_val, RR_val, amp_seq_val),
values_to = 'val', names_to = "val_type")
hmadd 3 * 20 empty values for the 2 tools that don’t report circRNA with BSJ count < 5 (this would probably have been more efficient to do before the pivot_longer)
hm = hm %>%
bind_rows(tibble(tool = "circRNA_finder", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "qPCR_val")) %>%
bind_rows(tibble(tool = "segemehl", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "qPCR_val")) %>%
bind_rows(tibble(tool = "circRNA_finder", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "RR_val")) %>%
bind_rows(tibble(tool = "segemehl", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "RR_val")) %>%
bind_rows(tibble(tool = "circRNA_finder", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "amp_seq_val")) %>%
bind_rows(tibble(tool = "segemehl", circ_id = paste('test', 1:20), count_group = 'count < 5', val = 'NANANA',
val_type = "amp_seq_val"))
hmadd one extra empty line for the tools that do report BSJ count < 5 that can then separate both groups
for (tool_name in cq %>% #filter(count_group == 'count < 5') %>%
pull(tool) %>% unique()) {
hm = hm %>%
bind_rows(tibble(tool = tool_name, circ_id = 'extra_line', count_group = 'extra_line',
val = "NANANA", val_type = "qPCR_val")) %>%
bind_rows(tibble(tool = tool_name, circ_id = 'extra_line', count_group = 'extra_line',
val = "NANANA", val_type = "RR_val")) %>%
bind_rows(tibble(tool = tool_name, circ_id = 'extra_line', count_group = 'extra_line',
val = "NANANA", val_type = "amp_seq_val"))
}
hmcheck if every tool has the same number of lines
change label of specific line sailfish-cir (does not report counts)
hm = hm %>%
mutate(count_group = ifelse(tool == 'sailfish-cir' & count_group == "extra_line",
'extra_line2', count_group))set everything as factors in the right order
hm$val_type = factor(hm$val_type, levels = c("qPCR_val", 'RR_val', 'amp_seq_val')) #"pass_NA_NA",
hm$all_val = factor(hm$all_val, levels = c("fail_fail_fail", "fail_fail_NA", "fail_fail_pass", "pass_fail_fail", "pass_fail_NA", "pass_NA_fail",
"pass_NA_NA", "pass_fail_pass", "pass_pass_fail","pass_NA_pass", "pass_pass_NA", "pass_pass_pass"))
hm$count_group = factor(hm$count_group, levels = c('count ≥ 5', "extra_line",'count < 5', 'no_counts', 'extra_line2'))
hm$tool = factor(hm$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl", "Sailfish-cir")) order per tool and per count group all circ according to their validation (all_val) and add a number to each circRNA
hm = hm %>%
group_by(tool) %>%
arrange(count_group, all_val) %>%
mutate(nr = row_number()) %>%
ungroup() %>%
# to have only one number per circ (instead of 3), take the mean (this step can be skipped and gives same results, but I though it might solve the random points)
group_by(circ_id, tool) %>%
mutate(nr = mean(nr)) %>%
ungroup()
hm$nr = factor(hm$nr,
levels = c("2", "5", "8", "11", "14", "17", "20", "23", "26", "29",
"32", "35", "38", "41", "44", "47", "50", "53", "56",
"59", "62", "65", "68", "71", "74", "77", "80", "83",
"86", "89", "92", "95","98", "101", "104", "107", "110",
"113", "116", "119", "122", "125", "128", "131", "134",
"137", "140", "143", "146", "149", "152", "155", "158",
"161", "164", "167", "170", "173", "176", "179", "182",
"185", "188", "191", "194", "197", "200", "203", "206",
"209", "212", "215", "218", "221", "224", "227", "230",
"233", "236", "239", "242", "245", "248", "251", "254",
"257", "260", "263", "266", "269", "272", "275", "278",
"281", "284", "287", "290", "293", "296", "299", "302",
"264", "265", "267", "268", "270", "271", "273", "274",
"276", "277", "279", "280", "282", "283"))plot the whole thing
hm %>%
ggplot(aes(val_type, nr)) +
geom_tile(aes(fill = val), colour = "white") +
scale_fill_manual(values=c('#CC79A7', "#ffffff", '#009E73', '#999999')) +
facet_wrap(~tool, ncol = 16, scales = 'free_y') +
#facet_grid(rows = vars(count_group), cols = vars(tool), scales = 'free', space = 'free') +
mytheme_discrete_x +
ylab('') +
xlab('') +
theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(),
axis.line.y = element_blank(),
legend.position = "right")get number for figure 21
cq %>%
group_by(tool, count_group) %>%
summarise(qPCR_P = 100*sum(qPCR_val == 'pass')/n(),
qPCR_F = 100*sum(qPCR_val == 'fail')/n(),
RR_P = 100*sum(RR_val == 'pass', na.rm = T)/n(),
RR_F = 100*sum(RR_val == 'fail', na.rm = T)/n(),
RR_NA = 100*sum(is.na(RR_val))/n(),
amp_P = 100*sum(amp_seq_val == 'pass', na.rm = T)/n(),
amp_F = 100*sum(amp_seq_val == 'fail', na.rm = T)/n(),
amp_NA = 100*sum(is.na(amp_seq_val))/n()) %>%
ungroup() %>% view()## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
cq %>%
filter(!is.na(amp_seq_val)) %>%
mutate(val_summary = ifelse(qPCR_val == 'pass', 1, 0),
RR_val = ifelse(is.na(RR_val), 'NA', RR_val),
val_summary = ifelse(RR_val == 'pass', val_summary + 1, val_summary),
amp_seq_val = ifelse(is.na(amp_seq_val), 'NA', amp_seq_val),
val_summary = ifelse(amp_seq_val == 'pass', val_summary + 1, val_summary),
val_summary = paste(as.character(val_summary), ' methods', sep = '')) %>%
ggplot(aes(tool, fill = val_summary)) +
geom_bar(position = 'fill') +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c('#CC79A7', "#D36027", '#0073B2', "#009E73")) +
facet_grid(~count_group, scales = 'free', space = 'free') +
xlab('') +
ylab('percentage of circRNAs') +
labs(fill="validated by") +
mytheme_discrete_x +
theme(legend.position = "right")val %>%
select(tool, count_group, nr_compound_total, perc_compound_val) %>%
mutate(margin = qnorm(0.975)*sqrt(perc_compound_val*(1-perc_compound_val)/nr_compound_total),
CI_low = perc_compound_val - margin,
CI_up = perc_compound_val + margin) %>%
ggplot(aes(tool, perc_compound_val, fill = count_group)) +
geom_bar(stat = 'identity') +
mytheme_discrete_x +
facet_grid(~count_group, scales = 'free_x', space = 'free') +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c('#00B9F2', '#E69F00' , '#999999')) +
geom_errorbar(aes(ymin=CI_low, ymax=CI_up),
width=.3, color = 'grey45') +
xlab('') +
ylab('compound precision metric (+/- 95% CI)') +
theme(legend.position = "")sens %>%
mutate(margin = qnorm(0.975)*sqrt(sensitivity*(1-sensitivity)/949), #949 is total nr of val circ
CI_low = sensitivity - margin,
CI_up = sensitivity + margin) %>%
ggplot(aes(tool, sensitivity, fill = count_group_median)) +
geom_bar(stat = 'identity') +
scale_y_continuous(labels = scales::percent_format()) +
mytheme_discrete_x +
geom_errorbar(aes(ymin=CI_low, ymax=CI_up),
width=.3, color = 'grey45') +
facet_wrap(~count_group_median) +
scale_fill_manual(values = c('#00B9F2', '#E69F00')) +
xlab('') +
theme(legend.position = '')get the set of validated circRNAs => 949
sens_set = cq %>%
# get set of uniquely validated circRNAs
filter(compound_val == 'pass') %>%
select(circ_id, cell_line, BSJ_count_median) %>% unique()
sens_setget cumulative nr of circ you expect at the BSJ_count_median (no tool info)
sens_set_nr = sens_set %>%
group_by(BSJ_count_median) %>%
summarise(total_nr_detected = n()) %>%
arrange(desc(BSJ_count_median)) %>%
mutate(cum_total_nr = cumsum(total_nr_detected))
sens_set_nrcalculate the cumulative sensitivity per BSJ count
sens_cum = sens_set %>%
# check witch tools have detected these
left_join(all_circ %>%
select(tool, circ_id, cell_line) %>% unique()) %>%
# add number that is expected to be detected in that BSJ_count_median group
# count number that is actually detected for each tool
group_by(tool, BSJ_count_median) %>%
summarise(nr_detected = n()) %>%
left_join(sens_set_nr) %>%
group_by(tool) %>%
arrange(desc(BSJ_count_median)) %>%
mutate(cum_nr = cumsum(nr_detected),
cum_sens = cum_nr/cum_total_nr) %>%
ungroup()## Joining with `by = join_by(circ_id, cell_line)`
## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(BSJ_count_median)`
sens_cum %>%
ggplot(aes(BSJ_count_median, cum_sens)) +
geom_point(size = 0.8, color = "#CC79A7") +
geom_line(color = "#CC79A7") +
facet_wrap(~tool, scales = 'free_x') +
scale_x_log10() +
scale_y_continuous(labels = scales::percent_format()) +
mytheme +
theme(legend.position = 'right') +
xlab('(median) BSJ count') +
ylab('cumulative sensitivity')see script panel 3
prec_recall = val %>%
filter(!count_group == "count < 5") %>%
select(tool, perc_compound_val) %>%
full_join(sens %>% filter(count_group_median == 'count ≥ 5') %>% select(tool, sensitivity)) ## Joining with `by = join_by(tool)`
prec_recall$tool = factor(prec_recall$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
prec_recall %>%
ggplot(aes(sensitivity, perc_compound_val, color = tool, label = tool)) +
geom_point() +
ylab('compound precision (BSJ count ≥ 5)') +
xlab('sensitivity (median BSJ count ≥ 5)') +
scale_x_continuous(labels = scales::percent_format(), limits = c(0,1)) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,1)) +
coord_fixed() +
geom_text_repel(max.overlaps = 20) +
mytheme +
theme(legend.position = '') +
geom_abline(intercept = 0, slope = 1, color = '#999999', linetype = "dashed")recalculate the precision per cell line
val_cl = cq %>%
select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
group_by(tool, count_group, cell_line) %>%
summarise(nr_qPCR_total = n(),
nr_qPCR_fail = sum(qPCR_val == 'fail'),
nr_qPCR_val = sum(qPCR_val == 'pass'),
nr_RR_total = n() - sum(is.na(RR_val)), # here NA are the ones that have are 'out_of_range'
nr_RR_fail = sum(RR_val == "fail", na.rm = T),
nr_RR_val = sum(RR_val == "pass", na.rm = T),
nr_amp_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
nr_compound_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included
nr_compound_fail = sum(compound_val == "fail", na.rm = T),
nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
perc_RR_val = nr_RR_val/nr_RR_total,
perc_amp_val = nr_amp_val/nr_amp_total,
perc_compound_val = nr_compound_val/nr_compound_total) %>%
ungroup()## `summarise()` has grouped output by 'tool', 'count_group'. You can override
## using the `.groups` argument.
val_cl_long = val_cl %>%
pivot_longer(cols = c(perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val),
names_to = 'type', values_to = 'precision')
val_cl_long$type = factor(val_cl_long$type, levels = c('perc_qPCR_val', 'perc_RR_val', 'perc_amp_val', 'perc_compound_val'))
type.labs = c('qPCR precision', 'RNase R precision', 'amplicon sequencing precision', 'compound precision')
names(type.labs) = c('perc_qPCR_val', 'perc_RR_val', 'perc_amp_val', 'perc_compound_val')
val_cl_long %>%
filter(!count_group == "count < 5") %>%
ggplot(aes(cell_line, precision, color = tool, group = tool)) +
geom_point() +
geom_line() +
mytheme_discrete_x +
theme(legend.position = 'right') +
facet_wrap(~type, labeller = labeller(type = type.labs)) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,1)) +
ylab('precision') +
xlab('cell line')first calculate the total nr of validated circ per count group
nr_val_cl = cq %>%
# get set of uniquely validated circRNAs
filter(compound_val == 'pass') %>%
select(circ_id, cell_line, count_group_median) %>% unique() %>%
group_by(count_group_median, cell_line) %>%
summarise(nr_expected = n())## `summarise()` has grouped output by 'count_group_median'. You can override
## using the `.groups` argument.
then calculate sensitivity by dividing nr of circ found by total
sens_cl = cq %>%
# get set of uniquely validated circRNAs
filter(compound_val == 'pass') %>%
select(circ_id, cell_line, count_group_median) %>% unique() %>%
# check witch tools have detected these
left_join(all_circ %>%
select(tool, circ_id, cell_line) %>% unique()) %>%
group_by(tool, count_group_median, cell_line) %>%
summarise(nr_detected = n()) %>%
left_join(nr_val_cl) %>%
mutate(sensitivity = nr_detected/nr_expected) %>%
ungroup()## Joining with `by = join_by(circ_id, cell_line)`
## `summarise()` has grouped output by 'tool', 'count_group_median'. You can
## override using the `.groups` argument.
## Joining with `by = join_by(count_group_median, cell_line)`
sens_cl %>%
filter(!count_group_median == "count < 5") %>%
ggplot(aes(cell_line, sensitivity, color = tool, group = tool)) +
geom_point() +
geom_line() +
mytheme_discrete_x +
theme(legend.position = 'right') +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,1)) +
xlab('cell line')number of circRNAs measured in more than one cell line
circ_cl = cq %>% inner_join(cq %>%
select(circ_id, cell_line) %>%
group_by(circ_id) %>%
unique() %>%
filter(n() > 1) %>%
select(circ_id) %>% unique()) %>%
select(circ_id, qPCR_val, qPCR_val_detail, RR_val,
RR_val_detail, amp_seq_val, amp_seq_val_detail) %>%
unique() %>%
#group_by(circ_id) %>% filter(n() == 1)
select(circ_id) %>% unique()## Joining with `by = join_by(circ_id)`
fail_circ = cq %>% select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>% unique() %>%
inner_join(circ_cl) %>%
select(-cell_line) %>% unique() %>% group_by(circ_id) %>% filter(n() > 1)## Joining with `by = join_by(circ_id)`
fail_circ = fail_circ %>%
filter(!amp_seq_val == 'fail',
!is.na(RR_val),
!is.na(amp_seq_val),
RR_val == 'fail') %>%
select(circ_id) %>% unique()
fail_circ## [1] 0.919
## [1] 0.081
# plot all Cq values
circ_cl = cq %>% inner_join(circ_cl) %>%
select(circ_id, cell_line, Cq_min_untreated, Cq_max_untreated) %>% unique() %>%
mutate(Cq_mean = (Cq_min_untreated + Cq_max_untreated)/2) %>%
select(-Cq_min_untreated, -Cq_max_untreated)## Joining with `by = join_by(circ_id)`
circ_cl = circ_cl %>%
# between HLF and NCI-H23
filter(cell_line == "HLF") %>%
rename(cell_line_1 = cell_line, Cq_mean_1 = Cq_mean) %>%
inner_join(circ_cl %>%
filter(cell_line == "NCI-H23") %>%
rename(cell_line_2 = cell_line, Cq_mean_2 = Cq_mean)) %>%
bind_rows(circ_cl %>%
# between HLF and SW480
filter(cell_line == "HLF") %>%
rename(cell_line_1 = cell_line, Cq_mean_1 = Cq_mean) %>%
inner_join(circ_cl %>%
filter(cell_line == "SW480") %>%
rename(cell_line_2 = cell_line, Cq_mean_2 = Cq_mean))) %>%
bind_rows(circ_cl %>%
# between NCI-H23 and SW480
filter(cell_line == "NCI-H23") %>%
rename(cell_line_1 = cell_line, Cq_mean_1 = Cq_mean) %>%
inner_join(circ_cl %>%
filter(cell_line == "SW480") %>%
rename(cell_line_2 = cell_line, Cq_mean_2 = Cq_mean)))## Joining with `by = join_by(circ_id)`
## Joining with `by = join_by(circ_id)`
## Joining with `by = join_by(circ_id)`
circ_cl %>%
ggplot(aes(Cq_mean_1, Cq_mean_2)) +
geom_point() +
geom_smooth(method = "lm", color = '#00B9F2') +
facet_wrap(~cell_line_1+cell_line_2, scales = 'free') +
xlim(17, 32) +
ylim(17, 32) +
stat_regline_equation(label.y = 32, label.x=17, aes(label = ..eq.label..)) +
stat_regline_equation(label.y = 30, label.x=17, aes(label = ..rr.label..)) +
xlab('Cq') +
ylab('Cq') +
#geom_abline(color = "#00B9F2") +
mytheme +
theme(aspect.ratio = 1) ## Warning: The dot-dot notation (`..eq.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(eq.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing non-finite values (`stat_regline_equation()`).
## Removed 1 rows containing non-finite values (`stat_regline_equation()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
plot those 12 circ => sup figure 31
cq_4 = cq %>%
select(circ_id, cell_line, Cq_max_untreated, Cq_min_treated) %>%
inner_join(fail_circ) %>%
pivot_longer(cols = c(Cq_max_untreated, Cq_min_treated), values_to = 'Cq', names_to = 'sample') %>%
mutate(Cq = as.numeric(Cq)) ## Joining with `by = join_by(circ_id)`
cq_4$circ_id = factor(cq_4$circ_id, levels = c('chr5:618989-655584', 'chr5:36982164-36986301','chr5:177209635-177212195', 'chr10:96155325-96160402'))
cq_4 %>%
ggplot(aes(sample, Cq, color = cell_line)) +
geom_point() +
scale_color_manual(name = "",
values = c("#0072B2", "#E69F00", "#CC79A7")) +
geom_line(aes(group = cell_line)) +
facet_grid(~circ_id) +
mytheme_discrete_x +
xlab('')count_Cq = all_circ %>%
inner_join(cq %>%
select(circ_id, Cq_min_untreated, Cq_max_untreated) %>%
unique() %>%
mutate(Cq_mean = rowMeans(select(.,c(Cq_min_untreated, Cq_max_untreated)), na.rm = T)) %>%
filter(!is.na(Cq_mean)))## Joining with `by = join_by(circ_id)`
## Warning in inner_join(., cq %>% select(circ_id, Cq_min_untreated, Cq_max_untreated) %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 331 of `x` matches multiple rows in `y`.
## ℹ Row 466 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## [1] 3240
count_Cq %>%
filter(!tool == "Sailfish-cir") %>%
ggplot(aes(log2(BSJ_count), Cq_mean)) +
#geom_density(adjust = 2, stat = 'identity') +
geom_point(size = 0.5, alpha = 0.2, fill=alpha("black", 0.2)) +
#geom_density(stat = 'identity') +
geom_smooth(method = "lm", color = '#00B9F2') +
facet_wrap(~tool, scales = 'free') +
ylab('mean of Cq values (qPCR replicates)') +
xlab('log2(BSJ count)') +
#stat_regline_equation(label.y = 37, label.x=6, aes(label = ..eq.label..)) +
#stat_regline_equation(label.y = 40, label.x=6, aes(label = ..rr.label..)) +
#stat_regline_equation(label.y = 42, label.x=6, aes(label = ..p.label..)) +
stat_regline_equation(label.x = 0, label.y = 10, size = 3)+
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "*`,`~")),
label.y= 5, label.x = 0, size = 3) +
mytheme +
theme(aspect.ratio = 1) +
xlim(0,11) +
#xlim(-25,17) +
ylim(0,45)## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 7 rows containing non-finite values
## (`stat_regline_equation()`).
## Warning: Removed 7 rows containing non-finite values (`stat_cor()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
val_db = cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, n_db, cell_line) %>% unique() %>%
pivot_longer(cols = c(qPCR_val, RR_val, amp_seq_val), names_to = "val_type", values_to = "val") %>%
mutate(n_db = ifelse(is.na(n_db), 0, n_db))
val_dbval_db$val_type = factor(val_db$val_type, levels = c('qPCR_val', 'RR_val', 'amp_seq_val'))
val_db %>%
ggplot(aes(n_db, fill = val)) +
geom_bar() +
facet_grid(~val_type) +
mytheme +
scale_fill_manual(values = c('#CC79A7', '#00A875' ,'#999999')) +
scale_x_continuous(breaks=c(0,5,10,15)) +
ylab('number of circRNAs') +
xlab('number of databases in which the circRNAs is reported')cq_db = cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, compound_val, dbs) %>%
unique() %>%
separate(dbs, sep = '/',
into = c('db_1', 'db_2', 'db_3', 'db_4', 'db_5', 'db_6',
'db_7', 'db_8', 'db_9', 'db_10', 'db_11', 'db_12')) %>%
pivot_longer(cols = c('db_1', 'db_2', 'db_3', 'db_4', 'db_5', 'db_6',
'db_7', 'db_8', 'db_9', 'db_10', 'db_11', 'db_12'),
names_to = 'db_nr',
values_to = 'db') %>%
filter(!is.na(db)) %>%
select(-db_nr)## Warning: Expected 12 pieces. Missing pieces filled with `NA` in 1283 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
cq_db_prec = cq_db %>%
group_by(db) %>%
summarise(nr_qPCR_total = n(),
nr_qPCR_fail = sum(qPCR_val == 'fail'),
nr_qPCR_val = sum(qPCR_val == 'pass'),
nr_RR_total = n() - sum(is.na(RR_val)), # here NA are the ones that have are 'out_of_range'
nr_RR_fail = sum(RR_val == "fail", na.rm = T),
nr_RR_val = sum(RR_val == "pass", na.rm = T),
nr_amp_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
nr_compound_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
nr_compound_fail = sum(compound_val == "fail", na.rm = T),
nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
perc_RR_val = nr_RR_val/nr_RR_total,
perc_amp_val = nr_amp_val/nr_amp_total,
perc_compound_val = nr_compound_val/nr_compound_total) %>%
ungroup()
cq_db_preccq_db_prec = cq_db_prec %>%
select(db, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val) %>%
pivot_longer(cols = c(-db), values_to = "perc", names_to = "val_type") %>%
separate(val_type, into = c('tmp', 'val_type', 'tmp2')) %>%
select(-tmp, -tmp2) %>% unique() %>%
full_join(cq_db_prec %>%
select(db, nr_qPCR_total, nr_RR_total, nr_amp_total, nr_compound_total) %>%
pivot_longer(cols = c(-db), values_to = "total_nr", names_to = "val_type") %>%
separate(val_type, into = c('tmp', 'val_type', 'tmp2')) %>%
select(-tmp, -tmp2)) %>%
mutate(margin = qnorm(0.975)*sqrt(perc*(1-perc)/total_nr),
CI_low = perc - margin,
CI_up = perc + margin) ## Joining with `by = join_by(db, val_type)`
cq_db_prec$val_type = factor(cq_db_prec$val_type, levels = c('qPCR', 'RR', 'amp', 'compound'))
cq_db_prec %>%
ggplot(aes(db, perc, fill = val_type)) +
geom_bar(stat = 'identity') +
geom_errorbar(aes(ymin=CI_low, ymax=CI_up),
width=.3, color = 'grey45') +
facet_wrap(~val_type, scales = 'free_x') +
scale_y_continuous(labels = scales::percent_format(), breaks = c(0,0.25, 0.5, 0.75, 1)) +
mytheme_discrete_x +
#theme(strip.text.y = element_text(size = 10), legend.position = "none") +
theme(legend.position = "none") +
scale_fill_manual(values = c('#009E73', '#F0E442' , '#0073B2', '#D36027')) +
geom_text(aes(label = total_nr, y = 0.1), colour = "black", size = 3) +
xlab('') +
ylab('precision (and number of observations)')cq_db %>%
group_by(db) %>%
inner_join(cq %>% filter(compound_val == "pass") %>%
select(circ_id) %>% unique()) %>%
summarise(nr_circ_detected = n(),
sens_db = n()/914) %>%
ggplot(aes(db, sens_db)) +
geom_bar(stat = 'identity', fill = "#5AB4E5") +
mytheme_discrete_x +
scale_y_continuous(label = scales::percent_format()) +
xlab('') +
ylab('sensitivity')## Joining with `by = join_by(circ_id)`
cq %>%
filter(!is.na(amp_seq_val)) %>%
mutate(val_summary = ifelse(qPCR_val == 'pass', 1, 0),
RR_val = ifelse(is.na(RR_val), 'NA', RR_val),
val_summary = ifelse(RR_val == 'pass', val_summary + 1, val_summary),
amp_seq_val = ifelse(is.na(amp_seq_val), 'NA', amp_seq_val),
val_summary = ifelse(amp_seq_val == 'pass', val_summary + 1, val_summary),
val_summary = paste(as.character(val_summary), ' methods', sep = '')) %>%
ggplot(aes(n_detected, fill = val_summary)) +
geom_bar() +
#scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c('#CC79A7', "#D36027", '#0073B2', "#009E73")) +
facet_grid(~count_group_median, space = 'free') +
xlab('number of tools by which the circRNA is detected') +
ylab('number of circRNAs') +
labs(fill="validated by") +
mytheme_discrete_x +
theme(legend.position = "right")see panel 4 script
all possible combo’s, only circRNAs ≥ 5
## Rows: 675 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (9): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 9894 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): tool_1, tool_2, tool_3, cell_line, count_group, tool_lt_1, tool_lt...
## dbl (10): nr_union, perc_compound_val_1, perc_compound_val_2, perc_compound_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
make subset
union_sub_3 = simple_union_3 %>%
#filter(nr_union > 4000) %>%
filter(nr_union > 8000) %>% # is 70% of HLF : 0.7*13,087
group_by(cell_line) %>%
#filter(nr_union - pmax(total_n_1, total_n_2, total_n_3) > 2499) %>%
ungroup() %>%
filter(perc_compound_val_1 > 0.9,
perc_compound_val_2 > 0.9,
perc_compound_val_3 > 0.9)
union_sub_3add individual tools of interest
# union_sub_3 = union_sub_3 %>%
# bind_rows(simple_union_3 %>%
# filter(tool_1 == tool_2 & tool_1 == tool_3,
# tool_1 %in% (union_sub_3 %>% pull(tool_1, tool_2) %>% unique())))remove doubles
give combo nr and add info
union_sub_3 = union_sub_3 %>%
mutate(combo = paste("combo", 1:16, sep = "_"))
union_sub_3 = union_sub_3 %>%
bind_rows(simple_union_3 %>%
filter(nchar(tool_combo) == 1) %>%
unique() %>%
right_join(union_sub_3 %>%
select(tool_lt_1, tool_lt_2, tool_lt_3, combo) %>%
pivot_longer(cols = -combo, names_to = 'tmp', values_to = 'tool_lt_1') %>%
select(-tmp)))## Joining with `by = join_by(tool_lt_1)`
## Warning in right_join(., union_sub_3 %>% select(tool_lt_1, tool_lt_2, tool_lt_3, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 34 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
plot
union_sub_3 %>%
filter(cell_line == 'HLF') %>%
left_join(all_circ %>% select(circ_id, cell_line, count_group) %>%
filter(count_group == "count ≥ 5") %>%
unique() %>% count(cell_line) %>% rename(total_n = n)) %>%
mutate(perc_union = nr_union/total_n) %>%
ggplot(aes(w_val_rate, perc_union)) +
geom_point(aes(color = (tool_1 == tool_2 & tool_2 == tool_3))) +
geom_text_repel(aes(label=tool_combo, color = (tool_1 == tool_2 & tool_2 == tool_3)),
max.overlaps = 20) +
facet_wrap(~combo, scales = 'free') +
mytheme +
theme(legend.position = 'NA') +
scale_color_manual(values = c('#CC79A7', '#0072B2')) +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
xlab('validation rate') +
ylab('number of circRNAs')## Joining with `by = join_by(cell_line)`
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing missing values (`geom_text_repel()`).
union_sub_3 = union_sub_3 %>%
bind_rows(simple_union %>%
right_join(union_sub_3 %>% select(tool_combo, combo) %>%
filter(nchar(tool_combo) == 3) %>%
mutate(tool_combo1 = paste(substr(tool_combo, 1, 1), substr(tool_combo, 2, 2), sep = ''),
tool_combo2 = paste(substr(tool_combo, 1, 1), substr(tool_combo, 3, 3), sep = ''),
tool_combo3 = paste(substr(tool_combo, 2, 2), substr(tool_combo, 3, 3), sep = '')) %>%
select(combo, tool_combo1, tool_combo2, tool_combo3) %>%
pivot_longer(cols = c(tool_combo1, tool_combo2, tool_combo3), names_to = "tmp", values_to = 'tool_combo') %>%
select(-tmp) %>%
unique()))## Joining with `by = join_by(tool_combo)`
## Warning in right_join(., union_sub_3 %>% select(tool_combo, combo) %>% filter(nchar(tool_combo) == : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 16 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
union_sub_3 = union_sub_3 %>%
group_by(combo, tool_combo, cell_line) %>%
sample_n(1)
union_sub_3 %>%
filter(cell_line == 'HLF') %>%
left_join(all_circ %>% select(circ_id, cell_line, count_group) %>%
filter(count_group == "count ≥ 5") %>%
unique() %>% count(cell_line) %>% rename(total_n = n)) %>%
mutate(perc_union = nr_union/total_n) %>%
ggplot(aes(w_val_rate, perc_union)) +
geom_point(aes(color = as.character(nchar(tool_combo)))) +
geom_text_repel(aes(label=tool_combo, color = as.character(nchar(tool_combo))),
max.overlaps = 20) +
facet_wrap(~combo, scales = 'free') +
mytheme +
theme(legend.position = 'NA') +
scale_color_manual(values = c('#CC79A7', '#E69F00', '#0072B2')) +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
xlab('(weighted) compound precision value') +
ylab('percentage of all predicted circRNAs')## Joining with `by = join_by(cell_line)`
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
check mean increase in perc
simple_union_3 %>%
filter(!(tool_1 == tool_2 & tool_1 == tool_3),
!tool_1 == tool_2,
!tool_2 == tool_3,
!tool_1 == tool_3,
perc_compound_val_1 >= 0.9,
perc_compound_val_2 >= 0.9,
perc_compound_val_3 >= 0.9) %>%
pull(perc_increase_t1) %>%
quantile()## 0% 25% 50% 75% 100%
## 0.003460873 0.223103785 0.617183889 1.679791211 19.710914454
sim_data = read_tsv('../data/details/sim_data_PMID_28594838.txt') %>%
select(tool, dataset, sensitivity, precision) %>%
mutate(paper = 'PMID 28594838') %>%
bind_rows(read_tsv('../data/details/sim_data_PMID_34645386.txt') %>%
select(tool, dataset, sensitivity, precision) %>%
mutate(paper = 'PMID 34645386'))## Rows: 22 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): tool_original, dataset, tool
## dbl (6): nr_detected, TP, sensitivity, precision, F1, AUC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 13 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): dataset, tool_original, tool
## dbl (5): nr_detected, nr_FP, nr_TP, sensitivity, precision
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sim_data$dataset = factor(sim_data$dataset, levels = c('pos', 'mixed', 'mix1', 'mix2', 'mix3'))
sim_data$paper = factor(sim_data$paper, levels = c('PMID 34645386', 'PMID 28594838'))
dataset.labs = c('mixed dataset (circBase + RefSeq)', 'positive dataset (circBase)', 'mixed dataset (circBase + Salmon)', 'mixed dataset + tandem RNAs', 'mixed dataset (CIRI-simulator and ART simulator)')
names(dataset.labs) = c('mixed', 'pos', 'mix1', 'mix2', 'mix3')
sim_data %>%
rename(precision_sim = precision) %>%
inner_join(val %>% filter(!count_group == "count < 5") %>%
select(tool, perc_compound_val)) %>%
ggplot(aes(precision_sim, perc_compound_val, color = tool)) +
geom_point() +
facet_wrap(~paper+dataset, labeller = labeller(dataset = dataset.labs), scales = 'free') +
mytheme +
theme(legend.position = 'right') +
geom_abline(intercept = 0, slope = 1, color = '#999999', linetype = "dashed") +
scale_x_continuous(labels = scales::percent_format(), limits = c(0,1)) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,1)) +
xlab('precision based on simulated data (PMID 28594838 and 34645386)') +
ylab('precision based on orthogonal validation (compound precision from this manuscript)')## Joining with `by = join_by(tool)`
sim_data_sens = sim_data %>%
rename(sensitivity_sim = sensitivity) %>%
inner_join(sens %>% filter(!count_group_median == "count < 5") %>%
select(tool, sensitivity)) ## Joining with `by = join_by(tool)`
sim_data_sens$dataset = factor(sim_data_sens$dataset, levels = c('pos', 'mixed', 'mix1', 'mix2', 'mix3'))
sim_data_sens$paper = factor(sim_data_sens$paper, levels = c('PMID 34645386', 'PMID 28594838'))
sim_data_sens %>%
ggplot(aes(sensitivity_sim, sensitivity, color = tool)) +
geom_point() +
facet_wrap(~paper+dataset, labeller = labeller(dataset = dataset.labs), scales = 'free') +
mytheme +
theme(legend.position = 'right') +
geom_abline(intercept = 0, slope = 1, color = '#999999', linetype = "dashed") +
scale_x_continuous(labels = scales::percent_format(), limits = c(0,1)) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,1)) +
xlab('sensitivity based on simulated data (PMID 28594838 and 34645386)') +
ylab('sensitivity based on orthogonal validation (from this manuscript)')